### Configuration
################################################################################
### R Options
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2

options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        future.globals.maxSize=min(memInKB, 20 * 1024^3),
        mc.cores=min(cpus,1),
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

### Fuctions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R", 
                        "functions_biomart.R", 
                        "functions_degs.R", 
                        "functions_io.R", 
                        "functions_plotting.R", 
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))

### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions 
param$debugging_mode = "default_debugging"
switch (param$debugging_mode, 
        default_debugging=on_error_default_debugging(), 
        terminal_debugger=on_error_start_terminal_debugger(),
        print_traceback=on_error_just_print_traceback(),
        on_error_default_debugging())
### Rmarkdown configuration
################################################################################

### Create output directories
if (!file.exists(file.path(param$path_out, "figures"))) dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
if (!file.exists(file.path(param$path_out, "data"))) dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)


### R Options
options(citation_format="pandoc", 
        knitr.table.format="html",
        kableExtra_view_html=TRUE)


### Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      results = "hold",              # show results after running whole chunk
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source='fold-hide',      # by default collapse code blocks
                      dev=c('png', 'svg', 'tiff'),          # create figures in png, tiff, and svg; the first device (png) will be used for HTML output
                      dev.args=list(png = list(type="cairo"),     # png: use cairo - works on cluster
                                    tiff = list(type="cairo", compression = 'zip')),  # tiff: use cairo - works on cluster, transparent background, compression type "zip" is ‘deflate (Adobe-style)’ 
                      dpi=300,                       # figure resolution
                      fig.retina=2,                 # retina multiplier
                      fig.path=paste0(file.path(param$path_out, "figures"), "/")  # Path for figures in png and pdf format (trailing "/" is needed)
)

### Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

### Required libraries
library(magrittr)

### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()

# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
bib_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","sc_analysis_references.bib"))
invisible(knitcitations::citep(bib_ref))

### Figure heights
# high figures
fig_large_height = 8
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4
# Load renv and virtualenvs
renv::load(file.path(param$path_to_git,"env/basic"))
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr)
library(enrichR)

Read data

Read gene annotation

Gene annotation including Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names, are loaded from a pre-prepared reference file or Ensembl.

## Read gene annotation
# We read gene annotation from file. 
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names. 
# ATTENTION: ONLY for human and mouse!!!

### Set reference
################################################################################
if (param$species=="human") {
  param$mart_dataset = ifelse(is.null(param$mart_dataset), "hsapiens_gene_ensembl", param$mart_dataset)
  param$mt = ifelse(is.null(param$mt), "^MT-", param$mt)
  param$enrichr_dbs = if(is.null(param$enrichr_dbs)) {
    c("GO_Biological_Process_2023", "WikiPathway_2023_Human", "Azimuth_2023", "CellMarker_2024")
  }
  param$annotation_dbs = ifelse(is.null(param$annotation_dbs), "BlueprintEncodeData()", param$annotation_dbs)
} else {
  if (param$species=="mouse") {
    param$mart_dataset = ifelse(is.null(param$mart_dataset), "mmusculus_gene_ensembl", param$mart_dataset)
    param$mt = ifelse(is.null(param$mt), "^mt-", param$mt)
    param$enrichr_dbs = if(is.null(param$enrichr_dbs)) {
      c("GO_Biological_Process_2023", "WikiPathways_2019_Mouse", "Azimuth_2023", "CellMarker_2024")
    }
    param$annotation_dbs = ifelse(is.null(param$annotation_dbs), "MouseRNAseqData()", param$annotation_dbs)
  } else {
    param$mart_dataset=param$mart_dataset
  }
}

# Set defaults
# Default is Ensembl release 98 which corresponds to 2020-A reference package of 10x Genomics Cell Ranger
param$annot_version = ifelse(is.null(param$annot_version), 98, param$annot_version)
param$annot_main = if(is.null(param$annot_main)) {
  c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
}
param$mart_attributes = if(is.null(param$mart_attributes)) {
  c(c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession"), 
    c("chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
}

param$path_reference = file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference = paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
param$file_annot = ifelse(is.null(param$file_annot), file.path(param$path_reference, param$reference), param$file_annot)
param$file_cc_genes = ifelse(is.null(param$file_cc_genes), file.path(param$path_reference, "cell_cycle_markers.xlsx"), param$file_cc_genes)



### Download reference if not existing
################################################################################
if (!file.exists(param$file_annot) | !file.exists(param$file_cc_genes)) {
  param$file_annot = file.path(param$path_reference, param$reference)
  param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")
  source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}


### read_ensembl_annotation
################################################################################

# Load from file
annot_ensembl = read.delim(param$file_annot)

# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Translate IDs
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main)) 
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]


### read_cc_genes
################################################################################
# Use biomart to translate human cell cycle genes to the species of interest

# Load from file
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)

# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))

Read scRNA-seq data

The workflow is appicable to single cell and nuclei RNAseq data pre-processed via 10x Genomics or SmartSeq-2 or for other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, a Seurat object can be loaded to inspect the stored scRNA seq data.

Pre-processing of 10x data with Cell Ranger We use the 10x Genomics Cell Ranger software to map 10x sequencing reads. The result is a count matrix that contains the number of unique transcripts per gene (rows) and cell (columns). To save storage space, the matrix is converted into a condensed format and described by the following 3 files:
  • “features.tsv.gz”: Tabular file with gene annotation (Ensembl gene ID and the gene symbol) to identify genes
  • “barcodes.tsv.gz”: File with cell barcodes to identify cells
  • “matrix.mtx.gz”: A condensed version of the count matrix
Pre-processing of SmartSeq-2 data Sequencing reads from SmartSeq-2 (and other) experiments can be mapped with any suitable mapping software as long as a count matrix is generated that contains the number of mapped reads per gene (rows) and cell (columns). The first row of the count matrix should contain cell barcodes or names, the first column of the matrix should contain Ensembl gene IDs.
What is Seurat and which information is contained in a Seurat object? Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell. Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (layer=‘counts’), normalized data (layer=‘data’), or z-scored/variance-stabilized (scaled) data (layer=‘scale.data’).
In addition, the workflow stores additional information in the Seurat object, including but not limited to dimensionality reduction and cluster results.

Here, for the project Testdata, the following data are analysed:

### Read rds object
################################################################################

### Load Seurat S4 objects 
# Test if file is defined
if (is.null(param$data)) {
  message("Dataset is not specified")
} else {
  # Test if file exists
  if (file.exists(param$data)) {
    # Read object
    message(paste0("Load dataset:", param$data))
    sc = base::readRDS(param$data)
    
    # Transfer original params to loaded object
    if ("parameters" %in% list_names(sc@misc[])) {
      # Retrieve previous parameter settings
      orig_param = sc@misc$parameters
      if ("SCT" %in% names(sc@assays)) {
        if ("scale.data" %in% Layers(sc[["SCT"]])) {
          orig_param$norm = "SCT"
        } else {
          orig_param$norm = "RNA"
        }
      } else {
        orig_param$norm = "RNA"
      }
      
      # Keep some parameter settings from object and project defined
      orig_param_keep = orig_param[c("annot_version", "species")]
      basic_param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
      # Test for reference concordance
      rerun_read_gene_annotation = (orig_param$species == param$species & orig_param$annot_version == param$annot_version)
      # Integrate parameter
      param = modifyList(x = param, val = orig_param)
      param = modifyList(x = param, val = basic_param_keep)
      param = modifyList(x = param, val = param_advset)
      param = modifyList(x = param, val = orig_param_keep)
    }
    
    # Rerun read_gene_annotation.R if not fitting to loaded object
    if (!rerun_read_gene_annotation == TRUE) {
      source(file.path(param$path_to_git,'scripts/read_data/read_gene_annotation.R'))
    }

    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(sc@meta.data[["orig.ident"]])) {
      if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
        message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples)) 
        sc = sample_colours_out[[1]]
        param$col_samples = sample_colours_out[[2]]
      }
    }
 
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(sc@meta.data[["seurat_clusters"]])) {
      if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
        message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters)) 
        sc = cluster_colours_out[[1]]
        param$col_clusters = cluster_colours_out[[2]]
      }
    }

    # Set colors for cell types based on annotation 
    if (!is.null(sc@meta.data[["annotation"]])) {
      if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
        message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation)) 
        sc = annotation_colours_out[[1]]
        param$col_annotation = annotation_colours_out[[2]]
      }
    }

  sc
  } else {
  message("Dataset does not exist")
  }
}

× (Message)
Load dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing/data/sc.rds

× (Message)
No or wrong number of distinct colors for clusters provieded. Generate colors using color paletteggsci::pal_igv

### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
  # Test if file exists
  if (file.exists(param$refdata)) {
   # Read object
    message(paste0("Load dataset:", param$refdata)) 
    scR = base::readRDS(param$refdata)
    
    # Transfer original params to loaded object
    if ("parameters" %in% list_names(scR@misc[])) {
      orig_paramR = scR@misc$parameters
      
      if (!is.null(orig_paramR$col_samples)) {
        param$col_samples_ref = orig_paramR$col_samples
      }
      if (!is.null(orig_paramR$col_clusters)) {
        param$col_clusters_ref = orig_paramR$col_clusters
      }
      if (!is.null(orig_paramR$col_annotation)) {
        param$col_annotation_ref = orig_paramR$col_annotation
      }
      param = modifyList(x = param, val = param_advset)
    }
    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(scR@meta.data[["orig.ident"]])) {
      if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
        message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples)) 
        scR = sample_colours_out[[1]]
        param$col_samples_ref = sample_colours_out[[2]]
      }
    }
    
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(scR@meta.data[["seurat_clusters"]])) {
      if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
        message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters)) 
        scR = cluster_colours_out[[1]]
        param$col_clusters_ref = cluster_colours_out[[2]]
      }
    }
    
    # Set colors for cell types based on annotation 
    if (!is.null(scR@meta.data[["annotation"]])) {
      if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
        message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation)) 
        scR = annotation_colours_out[[1]]
        param$col_annotation_ref = annotation_colours_out[[2]]
      }
    }
    
  scR
  } else {
  message("Reference dataset does not exist")
  }
}  
## An object of class Seurat 
## 6439 features across 1078 samples within 1 assay 
## Active assay: RNA (6439 features, 3000 variable features)
##  3 layers present: data, counts, scale.data
##  2 dimensional reductions calculated: pca, umap
# Confirm correct setting of normalization method
# Normalization method can not be overwritten with advanced_settings as long as the object was not normalized using the respective method
if (param$norm %in% names(sc@assays)) {
  if (param$norm == "RNA") {
    if ("scale.data" %in% Layers(sc[["RNA"]])) {
      param$norm = "RNA"
    } else {
      param$norm = "SCT"
    }
  } else if (param$norm == "SCT") {
    if ("scale.data" %in% Layers(sc[["SCT"]])) {
      param$norm = "SCT"
    } else {
      param$norm = "RNA"
    }
  } else {
    param$norm = orig_param$norm
  }
} else {
  param$norm = "RNA"
}
message(paste0("Chosen normalisation method and normlized values detecte in provided object were harmonised and set to: ", ifelse(param$norm=="RNA", "Standard log normalisation", "SCTransform")))

× (Message)
Chosen normalisation method and normlized values detecte in provided object were harmonised and set to: Standard log normalisation



Clustering

The clustering method used by Seurat first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag et al. (2019)).

Further explanation on clustering

At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.

During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.
# Set default clustering
n = paste0(DefaultAssay(sc), "_snn_res.", param$cluster_resolution)
sc$seurat_clusters = sc[[n, drop=TRUE]]
Seurat::Idents(sc) = sc$seurat_clusters
if (length(levels(sc$seurat_clusters)) > 1) {
  suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Misc(sc, "trees")[[n]]))})
}

# Set up colors for default clustering
sc = ScAddLists(sc, lists=list(seurat_clusters=Misc(sc, "colour_lists")[[n]]), lists_slot="colour_lists")
param$col_clusters = Misc(sc, "colour_lists")[["seurat_clusters"]]

Visualisation with UMAP

We use a UMAP to visualise and explore the dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.

Take care not to mis-read a UMAP:

  • Parameters influence the plot (we use defaults here)
  • Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
  • Distances between clusters might not mean anything
  • You may need more than one plot

For a nice read to intuitively understand UMAP, see Coenen and Pearce (2024).
For this report, you chose the resolution value 0.6 as the final value for further analyses.

Coloured by cluster

# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p_umap = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters") + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Cells coloured by cluster identity", legend_position="", legend_title="", xlab = "UMAP 1", ylab = "UMAP 2")
p_umap = LabelClusters(p_umap, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)

sample_cells = table(sc$orig.ident)
sample_labels = paste0(levels(sc$orig.ident)," (", sample_cells[levels(sc$orig.ident)],")")
# Note: This is a hack to colour by sample but label by Cluster
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident") +
  scale_color_manual(values=param$col_samples, labels=sample_labels) +
  AddStyle(title="Cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")
p2$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p2$data), ]
p2 = LabelClusters(p2, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)

p = p_umap + p2
p

Coloured by cluster (per sample)

# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p

Coloured by sample (per sample)

# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
  scale_color_manual(values=param$col_samples, labels=sample_labels) +
  AddStyle(title="Cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p



Cluster QC

Exploration of quality control metrics: determine whether clusters are unbalanced in their number of counts, detected genes or mitochondrial content.

p_list1 = list()
qc_feature = c(paste0("nCount_", param$assay_raw), paste0("nFeature_", param$assay_raw), "percent_mt", "percent_ribo")
title = c(paste0("Summed raw counts (nCount_", param$assay_raw, ", log10 scale)"), 
            paste0("Number of features with raw count > 0 (nFeature_", param$assay_raw, ", log10 scale)"),
            "Percent of mitochondrial features (percent_mt)", 
            "Percent of ribosomal features (percent_ribo)")
if ("percent_ercc" %in% colnames(sc[[]])) {
  qc_feature = c(qc_feature, "percent_ercc")
  title = c(title, "Percent of ERCC features (percent_ercc)")
}


for (n in seq(qc_feature)) {
  p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature[n]) + 
                          AddStyle(title="Feature plot") + 
                          scale_colour_gradient(low="lightgrey", high=param$col))

   if (qc_feature[n]==paste0("nCount_", param$assay_raw) | qc_feature[n]==paste0("nFeature_", param$assay_raw)) {
     p1 = p1 + scale_colour_gradient(low="lightgrey", high=param$col, trans="log10")
   }
  p1$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p1$data), ]
  p1 = LabelClusters(p1, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
  
  
  p2 = ggplot(sc[[]], aes(x=.data[["seurat_clusters"]], y=.data[[qc_feature[n]]], fill=.data[["seurat_clusters"]], group=.data[["seurat_clusters"]])) + 
    geom_violin(scale="width") + 
    AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
           xlab="Cluster", legend_position="none")
  
  if (qc_feature[n]==paste0("nCount_", param$assay_raw) | qc_feature[n]==paste0("nFeature_", param$assay_raw)) {
    p2 = p2 + scale_y_log10()
  }
  
  
  m = c(qc_feature[n], paste0("nFeature_", param$assay_raw))
  p3 = ggplot(sc[[c(m, "seurat_clusters")]], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["seurat_clusters"]])) + 
    geom_point() + 
    AddStyle(col=param$col_clusters) + 
    facet_wrap(~seurat_clusters) +
    theme(legend.position = "none")
  

  p_list1[[n]] = (p1 | p2) / p3
  p_list1[[n]] = p_list1[[n]] + patchwork::plot_annotation(title=title[n])
}

Number of counts

print(p_list1[[1]])

Number of features

print(p_list1[[2]])

Percent mitochondrial reads

print(p_list1[[3]])

Percent ribosomal reads

print(p_list1[[4]])

Percent ERCC

if ("percent_ercc" %in% colnames(sc[[]])) {
  print(p_list1[[5]])
} else {
  message("No ERCC controls found.")
}

× (Message)
No ERCC controls found.



Cell Cycle Effect

How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? If a cell cycle score for each cell based on its expression of G2M and S phase markers was calculated, it will be displayed here.

# Set up colours for cell cycle effect and add to sc object
col =  GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")

# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) + 
  geom_point() + 
  scale_x_continuous("G1/S score") + 
  scale_y_continuous("G2/M score") + 
  AddStyle(col=Misc(sc, "colour_lists")[["Phase"]]) +
  NoLegend()

p2 = ggplot(sc@meta.data %>% 
              dplyr::group_by(seurat_clusters, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=seurat_clusters, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_x_discrete("Seurat clusters") + 
  scale_y_continuous("Fraction of cells") + 
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  NoLegend() +
  theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) 

p3 = ggplot(sc[[]] %>% 
              dplyr::group_by(orig.ident, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=orig.ident, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_y_continuous("Fraction of cells") +
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  NoLegend() +
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")

p23 = p2 + p3 + patchwork::plot_layout(guides = "collect") & theme(legend.position = 'bottom')
p = p1 + p23
p

if (any(!is.na(sc$Phase))) {
  
  p1 = Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) + 
    AddStyle(title="Cell cycle phases", legend_title="Phase",  xlab = "UMAP 1", ylab = "UMAP 2")
  p1$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p1$data), ]
  p1 = LabelClusters(p1, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
  
  p_list = list()
  cc_features = c("S.Score", "G2M.Score", "CC.Difference")
  title = c("S phase", "G2/M phase", "G2/M & S difference")
  
  for (n in seq(cc_features)) {
  p_list[[n]] = Seurat::FeaturePlot(sc, features=cc_features[n], pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
    AddStyle(title=title[n], xlab = "", ylab = "")
  }
  p = p1 + patchwork::wrap_plots(p_list, ncol=1) + 
    plot_layout(widths = c(3, 1)) +
    patchwork::plot_annotation(title="UMAP coloured by") 
  p
}



Distribution of cells in clusters

# Count cells per cluster per sample 
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()

# Make table
tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
tbl[,"orig.ident"] = NULL

# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)

# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters) > 1) tbl = rbind(tbl, CellsFisher(sc))

# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]

# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p1 = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="Sample identity per clusters",
           fill=param$col_samples,
           legend_title="Sample",
           legend_position="bottom")

p = p1 | p_umap
p

The following table shows the number of cells per sample and cluster:

  • n: Number of cells per sample and cluster
  • perc: Percentage of cells per sample and cluster compared to all other cells of that cluster

In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:

  • oddsRatio: Odds ratio calculated for cluster c1 and sample s1 as (# cells s1 in c1 / # cells not s1 in c1) / (# cells s1 not in c1 / # cells not s1 not in c1)
  • p: P-value calculated with a Fisher test to test whether “n” is higher or lower than expected
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per sample and cluster") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
Number of cells per sample and cluster
Cl_1 Cl_2 Cl_3 Cl_4 Cl_5 Cl_6 Cl_7
Sample1_n 88 88 75 72 54 50 18
Sample1_perc 41.51 44.22 42.37 42.35 37.76 41.67 31.58
Sample1.oddsRatio 1.01 1.16 1.06 1.05 0.84 1.02 0.64
Sample1.p 5.0e-01 2.0e-01 4.0e-01 4.1e-01 8.4e-01 5.0e-01 9.5e-01
Sample2_n 124 111 102 98 89 70 39
Sample2_perc 58.49 55.78 57.63 57.65 62.24 58.33 68.42
Sample2.oddsRatio 0.99 0.86 0.95 0.95 1.18 0.98 1.56
Sample2.p 5.6e-01 8.4e-01 6.6e-01 6.5e-01 2.0e-01 5.8e-01 8.1e-02


Fraction of clusters per sample

# Make table
tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"])
tbl[,"orig.ident"] = NULL

# Plot counts
tbl_bar = tbl[cell_samples, , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(cols = tidyselect::all_of(cell_clusters), names_to="Cluster", values_to="Counts")
tbl_bar$Counts = as.numeric(tbl_bar$Counts)
tbl_bar$Cluster <- factor(tbl_bar$Cluster,levels = cell_clusters)
tbl_bar$Sample <- factor(tbl_bar$Sample, levels = cell_samples)

# Plot counts per condition
p1 = ggplot(tbl_bar, aes(x=Sample, y=Counts, fill=Cluster)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="", xlab = "",
           fill=param$col_clusters,
           legend_position="") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Calculate percentages
tbl_perc = round(t(tbl) / colSums(t(tbl)) * 100, 2) %>% t() %>% as.data.frame()
tbl_perc = round(tbl / rowSums(tbl) * 100, 2) %>% as.data.frame()

# Plot percentages
tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(cols = tidyselect::all_of(cell_clusters), names_to="Clusters", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Clusters <- factor(tbl_bar_perc$Clusters,levels = cell_clusters)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)

p2 = ggplot(tbl_bar_perc, aes(x=Sample, y=Percentage, fill=Clusters)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="", xlab = "",
           fill=param$col_clusters,
           legend_position="") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p_umap = p_umap + 
  AddStyle(title="")

p = p1 | p2 + 
  patchwork::plot_annotation(title="Cluster identity per sample") 
p = p | p_umap
p

# Combine tables
rownames(tbl) = paste0(rownames(tbl),"_n")
rownames(tbl_perc) = paste0(rownames(tbl_perc),"_perc")
tbl = rbind(tbl, tbl_perc) 

# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]

# Print table
knitr::kable(tbl, align="l", caption="Number of cells per cluster and sample") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
Number of cells per cluster and sample
1 2 3 4 5 6 7
Sample1_n 88.00 88.00 75.00 72.00 54.00 50.00 18.00
Sample1_perc 19.78 19.78 16.85 16.18 12.13 11.24 4.04
Sample2_n 124.00 111.00 102.00 98.00 89.00 70.00 39.00
Sample2_perc 19.59 17.54 16.11 15.48 14.06 11.06 6.16
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = ifelse(param$norm=="SCT", param$norm, param$assay_raw)


Assessing cluster separation

Cluster tree

Here, the relationship (similarity) between clusters is represented in a cluster tree. The cluster tree can help to identify which clusters might be candidates for merging.

# Set seurat_clusters for as default tree
sc@tools$BuildClusterTree = sc@misc$trees$seurat_clusters

# pull the tree
cluster_tree <- Tool(object = sc, slot = "BuildClusterTree")
cluster_tree$tip.label <- paste0(" Cluster ", cluster_tree$tip.label)
# plot the tree
p1 = wrap_elements(full= ~ ape::plot.phylo(x = cluster_tree, type = "c", use.edge.length = TRUE, font = 2, cex = 1.5, tip.color = sc@misc$colour_lists$seurat_clusters, edge.width = 2))

p2 = p_umap / plot_spacer() +
  plot_layout(widths = c(1, 2))
p_list = list(p1, plot_spacer(), p2)
p = wrap_plots(p_list) + 
  plot_layout(widths = c(10, 1, 4))
p

Silhouette plot

In contrast, the silhouette plot can aid in assessing cluster assignment.

What does a silhouette plot?

The silhouette plot visualizes the separation distance between clusters, i.e. how close each point in one cluster is to points in the neighboring clusters, measured as a silhouette score or coefficients. This score has a range of [-1, 1]. A score near +1 indicate that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values indicate that those samples might have been assigned to the wrong cluster. Hence, the silhouette plot can aid in assignment cluster assignement and confirming the choice of cluster resolution. Each cluster would ideally contain large positive silhouette widths, indicating that it is well-separated from other clusters. In contrast, smaller widths can arise from the presence of internal subclusters or overclustering.

# Extracting distance matrix
distance_matrix = dist(Embeddings(sc[['pca']])[, 1:param$pc_n])
clusters = sc@meta.data$seurat_clusters
silhouette = cluster::silhouette(as.numeric(clusters), dist = distance_matrix)
sc@meta.data$silhouette_score = silhouette[,3]
mean_silhouette_score = mean(sc@meta.data$silhouette_score)

# Generate table
silhouette_tbl = sc@meta.data %>%
  dplyr::mutate(barcode = rownames(.)) %>%
  dplyr::arrange(seurat_clusters,-silhouette_score) %>%
  dplyr::mutate(barcode = factor(barcode, levels = barcode))

# Plot
legend_title <- "Cluster"
p = ggplot(silhouette_tbl) +
  geom_col(aes(barcode, silhouette_score, fill = seurat_clusters), show.legend = TRUE) +
  geom_hline(yintercept = mean_silhouette_score, color = 'red', linetype = 'dashed') +
  scale_x_discrete(name = 'Cells') +
  scale_y_continuous(name = 'Silhouette score') +
  scale_fill_manual(legend_title, values = param$col_clusters) +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
p



Marker genes

We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on RNA data and the method “MAST”. Note that markers may bleed over between closely-related groups, i.e. they are not forced to be specific to only one cluster.
Resulting p-values are adjusted using the Bonferroni method. However, note that the p-values are likely inflated, since both clusters and marker genes were determined based on the same gene expression data, and there ought to be gene expression differences by design. Nevertheless, p-values can be used to sort and prioritize marker genes. We require marker genes to be expressed in at least 25% of cells in the respective cluster, with a minimum log2 fold change of 1 and adjusted p-value of at most 0.05. The results written to file “markers_cluster_vs_rest.xlsx”.

What are marker genes?

As described above, cell clusters approximate cell types and states. But how do we know which cell types these are? To characterize cell clusters, we identify marker genes. Good marker genes are genes that are particularly expressed in one cluster, and existing knowledge of these marker genes can be used to extrapolate biological functions for the cluster. A good clustering of cells typically results in good marker genes. Hence, if you cannot find good marker genes you may need to go back to the start of the workflow and adapt your parameters. Note that we also determine genes that are particularly down-regulated in one cluster, even if these are not marker genes in the classical sense.

Good marker genes are highly and possibly even only expressed in one cluster as compared to all other clusters. However, sometimes marker genes are also expressed in other clusters, or are declared as marker genes in these clusters, for example cell lineage markers that are shared by different cell subtypes. To evaluate marker genes, it is essential to visualize their expression patterns.

In addition to detecting marker genes, it might be informative to detect genes that are differentially expressed between one specific cluster and one or several other clusters. This approach allows a more specific distinction of individual clusters and investigation of more subtle differences, see the section “Differentially expressed genes” below.
# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells 
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers 

# Review recommends using "MAST"; Or "wilcox" or "LR"
# ALWAYS USE: assay="RNA"/"Spatial" or assay="SCT"
# DONT USE: assay=integrated datasets
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
#   https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
DefaultAssay(sc) = param$norm

if (param$norm=="SCT" & param$experimental_groups=="homogene") {
  sc = PrepSCTFindMarkers(sc)
}

markers = Seurat::FindAllMarkers(sc, assay=param$norm, test.use="MAST", only.pos=FALSE,
                                 min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
                                 latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE)

# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))

# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
  lfc_idx = grep("avg_log\\S*FC", colnames(markers))
  markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
  col_nms = colnames(markers)
  col_nms[2] = "avg_log2FC"
  colnames(markers) = col_nms
}

# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
  
# Filter markers 
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0

# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene, assay=param$assay_raw))

# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
                                        "p_val_adj_score",
                                        "avg_<assay>_<slot>_id<cluster>"), 
                               Description=c("Cluster",
                                             "Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
                                             "Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))


# Create directory
if (!file.exists(file.path(param$path_out, "data", "marker_genes"))) dir.create(file.path(param$path_out, "data", "marker_genes"), recursive=TRUE, showWarnings=FALSE)

invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
                                       annot_ensembl=annot_ensembl,
                                       gene_to_ensembl=seurat_rowname_to_ensembl,
                                       additional_readme=additional_readme,
                                       file=file.path(param$path_out, "data", "marker_genes", "markers_cluster_vs_rest.xlsx")))


# Plot number of differentially expressed genes
p = DegsPlotNumbers(markers_filt$all, 
                      group="cluster", 
                      title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")")) 

# Add marker table to seurat object
Seurat::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj, 
                                   log2FC=param$marker_log2FC, min_pct=param$marker_pct, assay=param$assay_raw, slot="data",
                                   latent_vars=param$latent_vars,
                                   results=markers_filt$all,
                                   enrichr=EmptyEnrichrDf(overlap_split=TRUE))

# Add marker lists to seurat object
marker_genesets_up = split(markers_filt$up$gene, markers_filt$up$cluster)
names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
marker_genesets_down = split(markers_filt$down$gene, markers_filt$down$cluster)
names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
sc = ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")

if (markers_found) {
  p
} else {
  warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}


Table of top marker genes

We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.

if (markers_found) {
  markers_top = DegsUpDisplayTop(markers_filt$up, n=5)
  markers_top_6 = DegsUpDisplayTop(markers_filt$up, n=6)
  markers_top_10 = DegsUpDisplayTop(markers_filt$up, n=10)
  
  # Add labels
  markers_top$labels = paste0(markers_top$cluster, ": ", markers_top$gene)

  # Show table
  knitr::kable(markers_top %>% dplyr::select(-labels), align="l", caption="Up to top 5 marker genes per cell cluster") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="500px") 
}
Up to top 5 marker genes per cell cluster
cluster gene avg_log2FC p_val p_val_adj pct.1 pct.2
1 NRGN 4.004 5.7e-72 3.7e-68 0.066 0.294
1 RPS12 1.046 2.1e-71 1.3e-67 0.958 0.997
1 PIK3IP1 2.142 2.3e-52 1.5e-48 0.774 0.297
1 CD3D 1.393 2.5e-52 1.6e-48 0.877 0.311
1 CCR7 2.812 1.7e-50 1.1e-46 0.627 0.127
2 CST3 2.611 3.3e-113 2.1e-109 1.000 0.223
2 TGFBI 2.480 1.7e-104 1.1e-100 0.945 0.155
2 PSAP 2.304 1.0e-103 6.4e-100 0.995 0.512
2 KLF4 1.789 9.5e-94 6.1e-90 0.955 0.206
2 CCDC88A 2.564 1.3e-93 8.5e-90 0.920 0.164
3 IGHM.1 8.981 1.5e-172 9.3e-169 0.870 0.024
3 CD79B 5.639 3.1e-170 2.0e-166 0.972 0.104
3 LINC00926 8.034 1.5e-155 1.0e-151 0.881 0.014
3 CD22 8.825 1.3e-144 8.5e-141 0.831 0.010
3 HLA-DQA1 3.659 2.3e-125 1.5e-121 0.966 0.202
4 S100A8 4.066 2.1e-132 1.4e-128 1.000 0.248
4 S100A9 3.161 1.2e-109 7.8e-106 1.000 0.290
4 MNDA 2.918 4.1e-105 2.7e-101 0.988 0.219
4 LYZ 2.630 9.8e-100 6.3e-96 1.000 0.308
4 FCN1 2.448 1.8e-94 1.2e-90 0.988 0.209
5 CD3D 1.552 5.6e-58 3.6e-54 0.993 0.335
5 IL7R 2.083 9.0e-56 5.8e-52 0.944 0.305
5 TRAC 1.684 9.5e-54 6.1e-50 0.986 0.385
5 CD3G 1.623 1.1e-45 6.8e-42 0.909 0.294
5 CD2 1.908 5.0e-38 3.2e-34 0.811 0.262
6 GZMK 4.865 6.9e-87 4.4e-83 0.842 0.050
6 GZMA 2.783 3.5e-84 2.2e-80 0.950 0.105
6 NKG7 2.258 9.4e-82 6.0e-78 0.967 0.135
6 KLRG1 3.554 6.0e-78 3.9e-74 0.900 0.120
6 CTSW 2.375 1.1e-70 7.1e-67 0.983 0.215
7 KLRD1 5.450 1.8e-74 1.2e-70 1.000 0.049
7 KLRF1 6.473 4.5e-73 2.9e-69 0.930 0.035
7 PRF1 5.087 2.6e-72 1.7e-68 1.000 0.126
7 CTSW 3.865 4.4e-67 2.8e-63 1.000 0.262
7 NKG7 4.751 9.9e-67 6.4e-63 1.000 0.184


Visualisation of top marker genes

The following plots visualize the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.

# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {

  # Feature plots and violin plots: each row contains 3 plots
  #   The plot has 5 columns and 1 rows per cluster, hence the layout works nicely if we find 
  height_per_row = 2.5
  nr_rows_5cols = ceiling(nrow(markers_top)/5)
  fig_height_5cols = height_per_row * nr_rows_5cols
  
  # Violin plots and violin plots: each row contains 3 plots
  #   The plot has 4 columns and 2 rows per cluster, hence the layout works nicely if we find 
  #     at least 6 markers per cluster
  height_per_row = 1.5
  nr_rows_3cols = ceiling(nrow(markers_top_6)/3)
  fig_height_3cols = height_per_row * nr_rows_3cols
  
  # Dotplots: each row contains 2 plots
  # The height of dotplots is dependent on the number of clusters
  height_per_row_variable = max(3, 0.3 * length(levels(sc$seurat_clusters)))
  nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
  fig_height_dp_2cols = height_per_row_variable * nr_rows_dp_2cols %>% min(100)
  
} else {
  fig_height_5cols = nr_rows_3cols = fig_height_dp_2cols = 7 
}

Feature plots

if (markers_found) {
  # Plot each marker one by one, and then combine them all at the end
  p_list = list()
  for (i in 1:nrow(markers_top)) { 
    p_list[[i]] = Seurat::FeaturePlot(sc, features=markers_top$gene[i], 
                                      cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),  
                                      combine=TRUE, label=TRUE) + 
      AddStyle(title=markers_top$labels[i], 
               xlab="", ylab="", 
               legend_position="bottom")
  }
  
  # Combine all plots
  p = patchwork::wrap_plots(p_list, ncol=5) + 
    patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
  p
}

Violin plots (normalised)

if (markers_found) {
  # Plot violin plots per marker gene, and combine it all at the end
  # This layout works out nicely if there are 4 marker genes per cluster
  p_list = list()
  for(i in 1:nrow(markers_top_6)) { 
    p_list[[i]] = Seurat::VlnPlot(sc, features=markers_top_6$gene[i], assay=param$norm, pt.size=0, cols=param$col_clusters) + 
      AddStyle(title=markers_top_6$labels[i], xlab="") + 
      theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1)) 
  }
  p = patchwork::wrap_plots(p_list, ncol=3) + 
    patchwork::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
  p
}

Dot plot (scaled)

What shows a dotplot? A dot plot visualizes how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene (column), we group cells based on cluster identity (rows), calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.
if (markers_found) {
  # Visualises how feature expression changes across different clusters
  # Plot dotplots per cluster, and combine it all at the end
  p_list = lapply(markers_top_10$cluster %>% sort() %>% unique(), function(cl) {
    genes = markers_top_10 %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
    p = suppressMessages(Seurat::DotPlot(sc, features=genes) + 
                           scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") +
                           AddStyle(title=paste0("Top marker genes in cluster ", cl), ylab="Cluster", xlab = "", legend_position="top") + 
                           theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
                           guides(size=guide_legend(order=1)))
    return(p)
  })
  
  p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot with top 10 up-regulated marker genes (scaled)")
  p
}



Expression per cluster per sample

If the dataset contains multiple samples, we visualize the expression of marker genes that are up-regulated in a cluster separately for each sample.

#fig_height_degs_per_cl = max(5, 
#                             max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 1) # We multiply by 2, as the legend requires quite some space

height_per_row_variable = max(3, 0.3 * (sc$orig.ident %>% unique() %>% length()))
nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
fig_height_degs_per_cl = height_per_row_variable * nr_rows_dp_2cols %>% min(100)

Non-scaled dotplots

First, we plot normalized expression with no further scaling. This plot helps to get an impression of the total expression of a gene.

if (markers_found) {
  n_genes_max_dotplot = 25
  p_list = list()
  for (cl in levels(sc$seurat_clusters)) {            
    markers_filt_up_cl_top = markers_filt$up %>% 
      dplyr::filter(cluster==cl) %>% 
      dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>% 
      dplyr::pull(gene)
    if (length(markers_filt_up_cl_top) > 0) {
      p_list[[cl]] = DotPlotUpdated(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("navy", "steelblue", "darkgoldenrod1")) +
        AddStyle(title=paste0("Cluster ", cl), ylab="", xlab = "", legend_position="top") + 
        theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) + 
        guides(size=guide_legend(order=1))
    }
  }
  
  p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot per cluster with top 25 up-regulated marker genes (not scaled)") 
  p
}

Scaled dotplots

Second, we plot scaled expression as explained above. This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples. However, for small sample numbers, this plot might be misleading as differences can appear inflated.

if (markers_found) {
  p_list = list()
  markers_filt_up_top = DegsUpDisplayTop(degs=markers_filt$up, n=25)
  for (cl in levels(sc$seurat_clusters)) {  
    markers_filt_up_cl_top = markers_filt_up_top %>% 
      dplyr::filter(cluster==cl) %>% 
      dplyr::pull(gene)

    if (length(markers_filt_up_cl_top) > 0) {
      p_list[[cl]] = suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
        scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") + 
        #viridis::scale_color_viridis(option = "D") +
        AddStyle(title=paste0("Cluster ", cl), ylab="", xlab = "", legend_position="top") + 
        theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) + 
        guides(size=guide_legend(order=1)))
    }
  }
  p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Dotplot per cluster with top 25 up-regulated marker genes (scaled)") 
  p
}


Functional enrichment analysis

To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms among up- and down-regulated genes of each cluster. We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (https://amp.pharm.mssm.edu/Enrichr/). You can choose to test functional enrichment from a wide range of databases. Over-represented terms are written to files “functions_marker_up_cluster_X_vs_rest.xlsx” and “functions_marker_down_cluster_X_vs_rest.xlsx”.

# Set Enrichr database
enrichR::setEnrichrSite(param$enrichr_site)

if (markers_found) {
  
  # Create directory
  if (!file.exists(file.path(param$path_out, "data", "marker_genes"))) dir.create(file.path(param$path_out, "data", "marker_genes"), recursive=TRUE, showWarnings=FALSE)

  
  # Upregulated markers
  
  # Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  # Tests done by Enrichr
  marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
    return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
    }))
  })

  # Write to files
  invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
                        file=file.path(param$path_out, "data", "marker_genes", paste0("functions_marker_up_cluster_", n, "_vs_rest.xlsx")))
  }))
  
  
  # Downregulated markers
  
  # Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  #  Tests done by Enrichr
  marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
    return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
    }))
  })
  
  # Write to files
  invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
                        file=file.path(param$path_out, "data", "marker_genes", paste0("functions_marker_down_cluster_", n, "_vs_rest.xlsx")))
  }))
  
  # Combine, flatten into data.frame and add to sc misc slot
  marker_genesets_enriched = c(marker_genesets_up_enriched, marker_genesets_down_enriched)
  marker_genesets_enriched = unname(marker_genesets_enriched)
  marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
  marker_genesets_enriched$Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
  marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
  
  misc_content = Misc(sc, "markers")
  misc_content[["enrichr"]] = marker_genesets_enriched
  suppressWarnings({Misc(sc, "markers") = misc_content})
}

The following table contains the top enriched terms per cluster and utilized database (GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024).

# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
  
  # Get top ten up and down over all databases per cluster
  marker_genesets_top_enriched = marker_genesets_enriched %>% dplyr::group_by(Cluster, Database) %>% dplyr::filter(Direction=="up") %>%
    dplyr::top_n(n=5, wt=Combined.Score)

  # Print as tabs
  enr_table = list()
  #cat("### {.tabset} \n \n")
  
  for(n in levels(marker_genesets_top_enriched$Cluster)){
    #cat("#### ", n, " \n")
    enr_table[[n]] = marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>%
      dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score)
    #enr_table = marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>%
    #  dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score)
    #print(knitr::kable(enr_table, align="l", caption="Top ten enriched terms per geneset", format="html") %>% 
    #kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    #kableExtra::scroll_box(width="100%", height="700px"))
    #cat(" \n \n")
  }
  
  #cat(" \n \n")
  concat_enr_table = data.table::rbindlist(enr_table, use.names = TRUE, idcol = "Cluster")
  
 knitr::kable(concat_enr_table, align="l", caption="Top ten enriched terms per geneset", format="html") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="500px")
}
Top ten enriched terms per geneset
Cluster Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
1 GO_Biological_Process_2023 T Cell Activation (GO:0042110) up 0.0002176 24.649689 361.82784
1 GO_Biological_Process_2023 Immunological Synapse Formation (GO:0001771) up 0.0356235 159.544000 1417.98073
1 WikiPathway_2023_Human Modulators Of TCR Signaling And T Cell Activation WP5072 up 0.0429244 20.995778 158.11260
1 WikiPathway_2023_Human Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 up 0.0429244 41.955790 277.26135
1 WikiPathway_2023_Human T Cell Activation SARS CoV 2 WP5098 up 0.0429244 14.307083 92.49825
1 WikiPathway_2023_Human Cancer Immunotherapy By PD 1 Blockade WP4585 up 0.0429244 37.956190 243.88595
1 Azimuth_2023 Bone Marrow-L2-CD4 Naive up 0.0000000 867.173913 27391.85175
1 Azimuth_2023 PBMC-L2-CD4+ Naive T up 0.0000000 530.425532 13329.95772
1 Azimuth_2023 Bone Marrow-L1-CD4 T up 0.0000000 530.425532 13329.95772
1 Azimuth_2023 PBMC-L2-CD8+ Naive T up 0.0000001 332.383333 6348.60155
1 Azimuth_2023 Adipose-L1-T up 0.0000001 332.383333 6348.60155
1 Azimuth_2023 PBMC-L3-CD4+ Naive T up 0.0000001 332.383333 6348.60155
1 Azimuth_2023 Bone Marrow-L2-CD8 Naive up 0.0000001 332.383333 6348.60155
1 Azimuth_2023 Lung v1-L1-CD4 T up 0.0000001 332.383333 6348.60155
1 CellMarker_2024 Naive CD8+ T Cell Peripheral Blood Human up 0.0000000 133.150409 8832.83924
1 CellMarker_2024 Naive CD4+ T Cell Peripheral Blood Human up 0.0000000 148.903654 5318.49258
1 CellMarker_2024 T Cell Bone Marrow Human up 0.0000000 182.375163 5355.60983
1 CellMarker_2024 T Cell Stomach Human up 0.0000000 144.420290 3500.41482
1 CellMarker_2024 T Cell Lymphoid Tissue Human up 0.0000168 244.200000 3394.17410
2 GO_Biological_Process_2023 Immune Response-Inhibiting Cell Surface Receptor Signaling Pathway (GO:0002767) up 0.0012019 45.170901 502.71042
2 GO_Biological_Process_2023 Positive Regulation Of CD4-positive, CD25-positive, Alpha-Beta Regulatory T Cell Differentiation (GO:0032831) up 0.0044526 67.607143 622.50307
2 GO_Biological_Process_2023 Regulation Of Metallopeptidase Activity (GO:1905048) up 0.0071789 45.069124 384.47910
2 GO_Biological_Process_2023 Toll-Like Receptor 3 Signaling Pathway (GO:0034138) up 0.0071789 45.069124 384.47910
2 GO_Biological_Process_2023 Positive Regulation Of Lipopolysaccharide-Mediated Signaling Pathway (GO:0031666) up 0.0071789 45.069124 384.47910
2 WikiPathway_2023_Human TYROBP Causal Network In Microglia WP3945 up 0.0000034 11.244562 211.56945
2 WikiPathway_2023_Human Degradation Pathway Of Sphingolipids Including Diseases WP4153 up 0.0002658 24.744147 340.39054
2 WikiPathway_2023_Human IL1 And Megakaryocytes In Obesity WP2865 up 0.0122561 11.905458 105.08445
2 WikiPathway_2023_Human Activation Of NLRP3 Inflammasome By SARS CoV 2 WP4876 up 0.0243088 33.800115 269.98228
2 WikiPathway_2023_Human Macrophage Markers WP4146 up 0.0398347 22.531106 160.98031
2 Azimuth_2023 Adipose-L2-hMono1 up 0.0000000 159.217442 3704.48949
2 Azimuth_2023 Adipose-L1-Monocyte up 0.0000008 90.765661 1688.27078
2 Azimuth_2023 Liver-L2-Monocyte up 0.0000008 90.765661 1688.27078
2 Azimuth_2023 PBMC-L1-Monocyte up 0.0000141 56.594329 813.63816
2 Azimuth_2023 PBMC-L1-dendritic Cell up 0.0000141 56.594329 813.63816
2 Azimuth_2023 PBMC-L2-CD14+ Monocyte up 0.0000141 56.594329 813.63816
2 Azimuth_2023 PBMC-L3-CD14+ Monocyte up 0.0000141 56.594329 813.63816
2 Azimuth_2023 Fetal Development-L2-Pancreas-Myeloid Cells up 0.0000141 56.594329 813.63816
2 Azimuth_2023 Tonsil-L2-Neutrophil Granulocytes up 0.0000141 56.594329 813.63816
2 Azimuth_2023 Lung v1-L2-Nonclassical Monocyte up 0.0000141 56.594329 813.63816
2 Azimuth_2023 Adipose-L2-hMono2 up 0.0000141 56.594329 813.63816
2 Azimuth_2023 Liver-L1-Myeloid up 0.0000141 56.594329 813.63816
2 CellMarker_2024 Monocyte Fetal Kidney Human up 0.0000000 17.588173 4631.13519
2 CellMarker_2024 CD1C-CD141- Dendritic Cell Blood Human up 0.0000000 20.532637 3310.47008
2 CellMarker_2024 CD1C+ B Dendritic Cell Blood Human up 0.0000000 32.355572 1650.02841
2 CellMarker_2024 Mononuclear Phagocyte Sputum Human up 0.0000000 106.139535 2343.75702
2 CellMarker_2024 Monocyte Undefined Human up 0.0000000 272.324826 5731.89220
3 GO_Biological_Process_2023 Immunoglobulin Mediated Immune Response (GO:0016064) up 0.0000009 48.710379 1020.91678
3 GO_Biological_Process_2023 Immunoglobulin Production Involved In Immunoglobulin-Mediated Immune Response (GO:0002381) up 0.0000332 56.278345 896.55833
3 GO_Biological_Process_2023 MHC Class II Protein Complex Assembly (GO:0002399) up 0.0001672 62.548032 831.97054
3 GO_Biological_Process_2023 Peptide Antigen Assembly With MHC Class II Protein Complex (GO:0002503) up 0.0001672 62.548032 831.97054
3 GO_Biological_Process_2023 Positive Regulation Of CD4-positive, CD25-positive, Alpha-Beta Regulatory T Cell Differentiation (GO:0032831) up 0.0171230 102.666667 798.18816
3 WikiPathway_2023_Human B Cell Receptor Signaling Pathway WP23 up 0.0000351 15.319396 239.69336
3 WikiPathway_2023_Human Allograft Rejection WP2328 up 0.0002293 13.457248 175.97467
3 WikiPathway_2023_Human Ebola Virus Infection In Host WP4217 up 0.0016554 9.137295 97.71566
3 WikiPathway_2023_Human Prion Disease Pathway WP3995 up 0.0466664 16.034483 108.03438
3 WikiPathway_2023_Human Genetic Causes Of Porto Sinusoidal Vascular Disease WP5269 up 0.0496391 13.673024 86.31243
3 Azimuth_2023 PBMC-L1-B Cell up 0.0000000 197.073413 4020.91851
3 Azimuth_2023 PBMC-L1-dendritic Cell up 0.0000000 197.073413 4020.91851
3 Azimuth_2023 Adipose-L2-hcDC2 up 0.0000000 197.073413 4020.91851
3 Azimuth_2023 Liver-L1-B up 0.0000000 197.073413 4020.91851
3 Azimuth_2023 Liver-L2-B up 0.0000000 197.073413 4020.91851
3 Azimuth_2023 Bone Marrow-L2-Naive B up 0.0000000 197.073413 4020.91851
3 CellMarker_2024 B Cell Kidney Human up 0.0000000 61.235433 13077.25267
3 CellMarker_2024 B Cell Skin Mouse up 0.0000000 97.642623 3078.98611
3 CellMarker_2024 B Cell Lung Human up 0.0000000 158.904000 3732.48288
3 CellMarker_2024 Follicular B Cell Spleen Human up 0.0000000 394.186508 8744.79847
3 CellMarker_2024 B Cell Lung Mouse up 0.0000464 232.816406 2983.51552
3 CellMarker_2024 Naive B Cell Spleen Human up 0.0000464 232.816406 2983.51552
4 GO_Biological_Process_2023 Vacuolar Acidification (GO:0007035) up 0.0001630 18.153938 261.97013
4 GO_Biological_Process_2023 Leukocyte Aggregation (GO:0070486) up 0.0006864 75.488953 956.76985
4 GO_Biological_Process_2023 Lysosomal Lumen Acidification (GO:0007042) up 0.0013799 21.827479 248.27203
4 GO_Biological_Process_2023 Regulation Of Leukocyte Degranulation (GO:0043300) up 0.0087622 42.338362 365.53648
4 GO_Biological_Process_2023 p38MAPK Cascade (GO:0038066) up 0.0116901 33.868966 276.93909
4 WikiPathway_2023_Human TYROBP Causal Network In Microglia WP3945 up 0.0000003 14.159292 301.38896
4 WikiPathway_2023_Human Degradation Pathway Of Sphingolipids Including Diseases WP4153 up 0.0013215 23.647640 276.32871
4 WikiPathway_2023_Human Activation Of NLRP3 Inflammasome By SARS CoV 2 WP4876 up 0.0064493 42.338362 365.53648
4 WikiPathway_2023_Human Macrophage Markers WP4146 up 0.0099703 28.222701 219.69663
4 WikiPathway_2023_Human COVID 19 And Endothelial Cell Senescence WP5256 up 0.0268984 37.528176 218.44006
4 Azimuth_2023 PBMC-L2-CD14+ Monocyte up 0.0000001 113.889855 2267.82956
4 Azimuth_2023 PBMC-L3-CD14+ Monocyte up 0.0000001 113.889855 2267.82956
4 Azimuth_2023 Adipose-L2-hMono1 up 0.0000001 113.889855 2267.82956
4 Azimuth_2023 Liver-L2-Monocyte up 0.0000001 113.889855 2267.82956
4 Azimuth_2023 Bone Marrow-L2-CD14 Monocyte up 0.0000001 113.889855 2267.82956
4 CellMarker_2024 Monocyte Fetal Kidney Human up 0.0000000 27.805576 9065.47514
4 CellMarker_2024 CD1C-CD141- Dendritic Cell Blood Human up 0.0000000 15.686061 1609.52197
4 CellMarker_2024 CD1C+ B Dendritic Cell Blood Human up 0.0000000 51.882801 3493.23744
4 CellMarker_2024 Macrophage Rectum Human up 0.0000001 113.889855 2267.82956
4 CellMarker_2024 Monocyte Undefined Human up 0.0000011 141.958092 2445.41040
5 GO_Biological_Process_2023 T Cell Activation (GO:0042110) up 0.0000000 34.321799 820.11926
5 GO_Biological_Process_2023 T-helper Cell Differentiation (GO:0042093) up 0.0020081 69.912281 755.45170
5 GO_Biological_Process_2023 Negative Regulation Of Phosphatidylinositol 3-Kinase Signaling (GO:0014067) up 0.0108912 98.192118 791.92876
5 GO_Biological_Process_2023 Regulation Of Transmembrane Transporter Activity (GO:0022898) up 0.0108912 98.192118 791.92876
5 GO_Biological_Process_2023 Response To Interleukin-4 (GO:0070670) up 0.0108912 98.192118 791.92876
5 WikiPathway_2023_Human Modulators Of TCR Signaling And T Cell Activation WP5072 up 0.0000056 40.171717 699.28370
5 WikiPathway_2023_Human Th17 Cell Differentiation Pathway WP5130 up 0.0002028 28.232955 370.64784
5 WikiPathway_2023_Human Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 up 0.0016753 58.251462 601.36439
5 WikiPathway_2023_Human Cancer Immunotherapy By PD 1 Blockade WP4585 up 0.0017770 52.421053 526.38584
5 WikiPathway_2023_Human T Cell Receptor And Co Stimulatory Signaling WP2583 up 0.0018068 41.926316 395.66236
5 Azimuth_2023 Bone Marrow-L1-CD4 T up 0.0000000 738.407407 22660.76312
5 Azimuth_2023 Bone Marrow-L2-CD8 Memory up 0.0000000 453.090909 11050.61134
5 Azimuth_2023 Lung v1-L1-CD4 T up 0.0000000 453.090909 11050.61134
5 Azimuth_2023 PBMC-L2-CD4+ Naive T up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 Adipose-L1-T up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 PBMC-L3-CD4+ Central Memory T 2 up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 Bone Marrow-L1-Other T up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 Bone Marrow-L2-CD4 Memory up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 Bone Marrow-L2-CD4 Naive up 0.0000001 284.785714 5272.34315
5 Azimuth_2023 Bone Marrow-L2-Mucosal Associated Invariant T up 0.0000001 284.785714 5272.34315
5 CellMarker_2024 T Cell Bone Marrow Human up 0.0000000 191.576923 6415.61849
5 CellMarker_2024 Naive T(Th0) Cell Colorectum Human up 0.0000000 712.071429 14693.53180
5 CellMarker_2024 Naive CD4+ T Cell Blood Human up 0.0000001 356.000000 6799.21697
5 CellMarker_2024 T Cell Lymphoid Tissue Human up 0.0000001 356.000000 6799.21697
5 CellMarker_2024 T Cell Lymphoid Tissue Mouse up 0.0000040 524.684210 7964.59750
6 GO_Biological_Process_2023 T Cell Activation (GO:0042110) up 0.0000006 23.951652 503.20780
6 GO_Biological_Process_2023 Immunological Synapse Formation (GO:0001771) up 0.0004239 189.056962 2454.66207
6 GO_Biological_Process_2023 Alpha-Beta T Cell Activation (GO:0046631) up 0.0035876 62.993671 657.80825
6 GO_Biological_Process_2023 Cellular Response To Copper Ion (GO:0071280) up 0.0057912 44.454952 424.04507
6 GO_Biological_Process_2023 Positive Regulation Of Antigen Receptor-Mediated Signaling Pathway (GO:0050857) up 0.0057912 44.454952 424.04507
6 GO_Biological_Process_2023 Lens Fiber Cell Differentiation (GO:0070306) up 0.0385587 62.218750 449.24808
6 WikiPathway_2023_Human Pathogenesis Of SARS CoV 2 Mediated By Nsp9 Nsp10 Complex WP4884 up 0.0000020 80.771104 1432.74689
6 WikiPathway_2023_Human Cancer Immunotherapy By PD 1 Blockade WP4585 up 0.0000020 71.789322 1237.77880
6 WikiPathway_2023_Human Modulators Of TCR Signaling And T Cell Activation WP5072 up 0.0000076 28.511483 442.33145
6 WikiPathway_2023_Human T Cell Receptor Signaling Pathway WP69 up 0.0000196 22.710526 324.11319
6 WikiPathway_2023_Human T Cell Receptor And Co Stimulatory Signaling WP2583 up 0.0001236 42.508547 518.98916
6 Azimuth_2023 Liver-L2-CD8 T up 0.0000000 929.413333 32681.07141
6 Azimuth_2023 PBMC-L3-Mucosal Associated Invariant T up 0.0000000 524.078947 15065.37560
6 Azimuth_2023 Liver-L1-T/NK up 0.0000000 524.078947 15065.37560
6 Azimuth_2023 Bone Marrow-L2-Mucosal Associated Invariant T up 0.0000000 524.078947 15065.37560
6 Azimuth_2023 Lung v1-L1-CD8 T up 0.0000000 524.078947 15065.37560
6 CellMarker_2024 T Cell Bone Marrow Human up 0.0000000 284.371429 14796.74226
6 CellMarker_2024 CD8+ T Cell Undefined Human up 0.0000000 306.832192 12228.05918
6 CellMarker_2024 Naive CD8+ T Cell Bile Duct Human up 0.0000000 646.623377 15887.39418
6 CellMarker_2024 Cytotoxic T Cell Peripheral Blood Human up 0.0000000 646.623377 15887.39418
6 CellMarker_2024 Memory CD8+ T Cell Peripheral Blood Human up 0.0000000 1021.384615 20892.31460
7 GO_Biological_Process_2023 Natural Killer Cell Mediated Immunity (GO:0002228) up 0.0053425 23.303803 276.57638
7 GO_Biological_Process_2023 Negative Regulation Of Natural Killer Cell Mediated Cytotoxicity (GO:0045953) up 0.0142139 26.312000 265.80056
7 GO_Biological_Process_2023 Type II Interferon-Mediated Signaling Pathway (GO:0060333) up 0.0267576 39.322709 343.52501
7 GO_Biological_Process_2023 T-helper Cell Lineage Commitment (GO:0002295) up 0.0267576 33.703472 282.73146
7 GO_Biological_Process_2023 Negative Regulation Of Natural Killer Cell Mediated Immunity (GO:0002716) up 0.0267576 33.703472 282.73146
7 WikiPathway_2023_Human Proteasome Degradation WP183 up 0.0000339 13.649365 222.35594
7 WikiPathway_2023_Human Interactions Of Natural Killer Cells In Pancreatic Cancer WP5092 up 0.0002375 21.690616 296.10146
7 WikiPathway_2023_Human Cancer Immunotherapy By PD 1 Blockade WP4585 up 0.0009085 22.008032 256.03162
7 WikiPathway_2023_Human Iron Sulfur Cluster Biogenesis WP5152 up 0.0255013 19.655379 139.61244
7 WikiPathway_2023_Human TCA Cycle Nutrient Use And Invasiveness Of Ovarian Cancer WP2868 up 0.0343871 52.230159 337.35040
7 Azimuth_2023 Liver-L2-NK up 0.0000000 279.773279 7577.37452
7 Azimuth_2023 Bone Marrow-L2-Innate Lymphoid Cell up 0.0000000 159.217742 3480.04251
7 Azimuth_2023 Lung V2 (HLCA)-ann Level 4-NK Cells up 0.0000011 99.106426 1692.28584
7 Azimuth_2023 PBMC-L2-gamma-delta T up 0.0000011 99.106426 1692.28584
7 Azimuth_2023 Lung V2 (HLCA)-ann Finest level-NK Cells up 0.0000011 99.106426 1692.28584
7 Azimuth_2023 PBMC-L3-CD8+ Effector Memory T 2 up 0.0000011 99.106426 1692.28584
7 Azimuth_2023 Lung v1-L1-Natural Killer up 0.0000011 99.106426 1692.28584
7 Azimuth_2023 Lung V2 (HLCA)-ann Level 3-Innate Lymphoid Cell NK up 0.0000011 99.106426 1692.28584
7 CellMarker_2024 Cytotoxic CD4+ T Cell Liver Human up 0.0000000 40.374905 2793.55992
7 CellMarker_2024 Cytotoxic Cell Stomach Human up 0.0000000 90.633674 2667.11335
7 CellMarker_2024 Cytotoxic T Cell Colorectum Human up 0.0000002 198.232932 3735.97305
7 CellMarker_2024 Cytotoxic T Cell Brain Human up 0.0000034 315.920000 5019.58323
7 CellMarker_2024 T helper(Th) Cell Brain Mouse up 0.0000079 157.952000 2337.72120




Cell type annotation with SingleR

Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Given a reference dataset (of samples from single-cell or bulk RNA sequencing) with known labels, it is possible to assign labels to the cells from a test dataset based on similarity to that reference. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from the reference dataset to the test dataset in an automated manner and aid in cluster identification.

Here, we performed cell and cluster annotation with cell types from the reference dataset BlueprintEncodeData() obtained from celldex (https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html). The celldex package provides access to several reference datasets (mostly derived from bulk RNA-seq or microarray data).

# Required libraries
library(SingleR)
library(celldex)

# Load reference datasets
# Reference dataset obtained from celldex for singleR
# https://bioconductor.org/packages/3.14/data/experiment/vignettes/celldex/inst/doc/userguide.html
# Function to paste reference name and download reference dataset
str_eval=function(x) {return(eval(parse(text=x)))}
ref = str_eval(param$annotation_dbs)
ref_name = gsub("[()]","",param$annotation_dbs)
# Annotate cells and clusters using SingleR with reference dataset form celldex
sce_ann_cells = SingleR::SingleR(test = GetAssayData(sc, assay = param$norm, slot = 'data'),
  ref = ref, assay.type.test = 1, labels = ref@colData@listData$label.main)

sce_ann_clusters = SingleR::SingleR(test = GetAssayData(sc, assay = param$norm, slot = 'data'),
  ref = ref, assay.type.test = 1, labels = ref@colData@listData$label.main, clusters = sc$seurat_clusters)

annotated_cells = table(sce_ann_cells$labels)
annotated_clusters = table(sce_ann_clusters$labels)

sc[["SingleR.labels"]] = sce_ann_cells$labels
sc[["SingleR.cluster.labels"]] = sce_ann_clusters$labels[match(sc[[]][["seurat_clusters"]], rownames(sce_ann_clusters))]

Annotation of single cells

UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cell. The annotation of each cell is more sensitive, but also more prone to artefacts compared to the annotation of clusters as performed in later steps. Here, we perform annotation of single cells for annotation diagnostics, that means for assessment of the reliability of cell type annotation and how close all cells resemble the cell types of the reference dataset.

# Print table with cell annotation
knitr::kable(annotated_cells, align="l", caption="Cell types assigned to cells", format="html") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="300px")
Cell types assigned to cells
Var1 Freq
B-cells 180
CD4+ T-cells 218
CD8+ T-cells 232
Erythrocytes 5
HSC 7
Mesangial cells 1
Monocytes 368
Neutrophils 1
NK cells 66
# Visualization of clusters
p_umap = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size) + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  AddStyle(title="Clusters", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "bottom") +
  theme(title = element_text(size = 10)) +
  NoGrid() + 
  NoLegend()
p_umap = LabelClusters(p_umap, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)

# Visualization of singleR annotation of cells
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="SingleR.labels", pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cell)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="bottom", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p = p_umap + p2
p


Annotation diagnostics

Heatmap displays the scores for all cells across all reference labels, allowing to assess the confidence of the corresponding predicted labels. Ideally, each cell should have one score that is distinctively higher than the rest, indicating that an unambiguous assignment.

# Annotation diagnostics
# https://bioconductor.org/packages/devel/bioc/vignettes/SingleR/inst/doc/SingleR.html

p = plotScoreHeatmap(sce_ann_cells)
p



Plot displaying per-cell “deltas” (the difference between the score for the assigned label and the median across all labels). Low deltas indicate that the assignment is uncertain. The minimum threshold on the deltas is defined using an outlier-based approach. Yellow marked points represents outliers that fell below the threshold.

# Set figure height for diagnostic plot
number_fig_dig = length(unique(sce_ann_cells$pruned.labels))
fig_height_dig = ceiling(number_fig_dig/6)*3
# Annotation diagnostics
p = plotDeltaDistribution(sce_ann_cells, ncol = 6) +
  NoLegend()
p

number_pruned_table = table(is.na(sce_ann_cells$pruned.labels))
number_pruned_table[3]=round((number_pruned_table[1]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
number_pruned_table[4]=round((number_pruned_table[2]/(number_pruned_table[1]+number_pruned_table[2])*100),2)
names(number_pruned_table) = c("assigned", "ambiguous", "% assigned", "% ambiguous")

number_pruned_table = (t(number_pruned_table))
knitr::kable(number_pruned_table, align="l", caption="Number of annotated cells") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) 
Number of annotated cells
assigned ambiguous % assigned % ambiguous
1043 35 96.75 3.25


Annotation of clusters

UMAPs display cells colored by Seurat clusters and cell types annotated by SingleR. Annotation was performed for each cluster as unit.

# Set to annotation metadata column
sc$annotation = sc$SingleR.cluster.labels

# Set colors for cell types based on annotation 
if (!is.null(sc@meta.data[["annotation"]])) {
  if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
    message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
    annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation)) 
    sc = annotation_colours_out[[1]]
    param$col_annotation = annotation_colours_out[[2]]
  }
}


# Print table with cell annotation
knitr::kable(annotated_clusters, align="l", caption="Cell types assigned to clusters", format="html") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="300px")
Cell types assigned to clusters
Var1 Freq
B-cells 1
CD4+ T-cells 1
CD8+ T-cells 2
Monocytes 2
NK cells 1


Coloured by cell type

# Visualization of singleR annotation of clusters
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="annotation", cols = param$col_annotation, pt.size=param$pt_size) + 
  AddStyle(title="Cell types \n(annotation per cluster)", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p2$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p2$data), ]
p2 = LabelClusters(p2, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)

p = p_umap + p2
p

Coloured by cell type (per sample)

# Visualization of singleR annotation of clusters separately per sample
p = Seurat::DimPlot(sc, reduction="umap", group.by="annotation", split.by = "orig.ident", cols = param$col_annotation, pt.size=param$pt_size) + 
  AddStyle(title="", xlab = "UMAP 1", ylab = "UMAP 2", legend_position="right", legend_title="Cell types") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel = TRUE)
p



Fraction of sample per cell type

# Set 
cell_samples = levels(sc$orig.ident)
cell_celltype = unique(sc$annotation)

# Make count table
tbl = dplyr::count(sc[[c("orig.ident", "annotation")]], orig.ident, annotation) %>% tidyr::pivot_wider(names_from="annotation", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"])
tbl[,"orig.ident"] = NULL

tbl_bar = tbl[cell_samples, , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Celltype", values_to="Counts")
tbl_bar$Counts = as.numeric(tbl_bar$Counts)
tbl_bar$Celltype <- factor(tbl_bar$Celltype,levels = cell_celltype)
tbl_bar$Sample <- factor(tbl_bar$Sample, levels = cell_samples)

# Plot counts
p1 = ggplot(tbl_bar, aes(x=Celltype, y=Counts, fill=Sample)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="",
           fill=param$col_samples,
           legend_title="Sample",
           legend_position="bottom") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Make table with percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t() %>% as.data.frame()

tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Celltype", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Celltype <- factor(tbl_bar_perc$Celltype,levels = cell_celltype)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)

# Plot percentages
p2 = ggplot(tbl_bar_perc, aes(x=Celltype, y=Percentage, fill=Sample)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="",
           fill=param$col_samples,
           legend_title="Sample",
           legend_position="") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p = p1 + p2
p


Fraction of cell types per sample

# Plot counts
p1 = ggplot(tbl_bar, aes(x=Sample, y=Counts, fill=Celltype)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="", xlab = "",
           fill=param$col_annotation,
           legend_title="Sample",
           legend_position="bottom") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Calculate percentages
tbl_perc = round(t(tbl) / colSums(t(tbl)) * 100, 2) %>% t() %>% as.data.frame()
tbl_perc = round(tbl / rowSums(tbl) * 100, 2) %>% as.data.frame()

# Plot percentages
tbl_bar_perc = tbl_perc[cell_samples, , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(cols = tidyselect::all_of(cell_celltype), names_to="Cell_type", values_to="Percentage")
tbl_bar_perc$Percentage = as.numeric(tbl_bar_perc$Percentage)
tbl_bar_perc$Cell_type <- factor(tbl_bar_perc$Cell_type,levels = cell_celltype)
tbl_bar_perc$Sample <- factor(tbl_bar_perc$Sample, levels = cell_samples)

p2 = ggplot(tbl_bar_perc, aes(x=Sample, y=Percentage, fill=Cell_type)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="", xlab = "",
           fill=param$col_annotation,
           legend_position="") + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p = p1 + p2 
p

p = ggplot(data = tbl_bar_perc, aes(x = Cell_type, y = Sample)) +
  geom_point(aes(color = Percentage, size = ifelse(Percentage==0, NA, Percentage))) +
  labs(col="Percentage", size="") + 
  scale_colour_gradient2(low="navy", mid="steelblue", high="darkgoldenrod1") +
  #viridis::scale_color_viridis(option = "D") +
  AddStyle(title="", ylab="Sample", legend_position="bottom") + 
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
  guides(size=guide_legend(order=1))

p



Data export

# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))

# Add parameter
Seurat::Misc(sc, "parameters") = param

# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))
### Export seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
### Convert data
################################################################################

### Export to Loupe Cell Browser
if (all(param$path_data$type == "10x")) { 
  
  # Export reductions (umap, pca, others)
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  
  # Export categorical metadata
  loupe_meta = sc@meta.data
  idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
  write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"), 
              file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}



### Export as andata object (h5ad file)
# Convert Seurat v5 single cell object to anndata object
# Does not work for SCT at the moment
# However, it would contain counts and data layer
#scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad", assay="RNA", 
#                        main_layer = "data", other_layers = "counts", transer_dimreduc = TRUE, verbose = FALSE)

# scesy only worked for v3 and v4 objects
# Convert to V3/4/Assay structure
sc_v3 = scCustomize::Convert_Assay(seurat_object = sc, convert_to = "V3", assay = "RNA")

# Convert Seurat v3 single cell object to anndata object
adata = sceasy::convertFormat(sc_v3, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))
# Write to h5ad file
adata$write(file.path(param$path_out, "data", "sc_anndata.h5ad"), compression="gzip")



### Export count matrix
invisible(ExportSeuratAssayData(sc, 
                                dir=file.path(param$path_out, "data", "counts"), 
                                assays=param$assay_raw, 
                                slot="counts",
                                include_cell_metadata_cols=colnames(sc[[]]),
                                metadata_prefix=paste0(param$project_id, ".")))


### Export metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)
Export seurat object

We export the assay data, cell metadata, clustering and visualization.

Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse matrix)

Export as andata object

We export the assay data, cell metadata, clustering and visualization in andata format.

Result file:
* sc.h5ad: H5AD object

Export to Loupe Cell Browser

If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser).

Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell cycle phases

Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.





Parameters and software versions

The following parameters were used to run the workflow.

Parameters
out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
path_to_git /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis
path_to_basic_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/basic_settings.R
path_to_advanced_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/advanced_settings.R
scriptname scripts/pre-processing/cluster_analysis.Rmd
author kosankem
assay_raw RNA
downsample_cells_equally FALSE
cell_filter nFeature_RNA:20, NA; nCount_RNA:200, 20000; percent_mt:0, 18; Sample1:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18); Sample2:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18)
feature_filter min_counts:1; min_cells:5; Sample1:min_counts=1, min_cells=5; Sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
integrate_samples method:merge; dimensions:30; k_anchor:10; k_weight:100; integration_function:CCAIntegration
experimental_groups homogene
pc_n 8
cluster_k 20
umap_k 30
cluster_resolution_test 0.5, 0.7, 0.8
cluster_resolution 0.6
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
enrichr_site Enrichr
enrichr_padj 0.05
col_palette_samples ggsci::pal_igv
col_palette_clusters ggsci::pal_igv
col_palette_annotation ggsci::pal_ucscgb
col #0086b3
col_bg #D3D3D3
pt_size 0.5
project_id Testdata
data /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing/data/sc.rds
path_data name:Sample1, Sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/
species human
path_out /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis
debugging_mode default_debugging
mart_dataset hsapiens_gene_ensembl
mt ^MT-
enrichr_dbs GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024
annotation_dbs BlueprintEncodeData()
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
path_reference /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98
reference hsapiens_gene_ensembl.v98.annot.txt
file_annot /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx
col_samples Sample1=#5050FFFF, Sample2=#CE3D32FF
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF
col_annotation Monocytes=#FF0000FF, CD8+ T-cells=#FF9900FF, CD4+ T-cells=#FFCC00FF, B-cells=#00FF00FF, NK cells=#6699FFFF

This report was generated using the scripts/pre-processing/cluster_analysis.Rmd script. Software versions were collected at run time.

Software versions
out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Value
Run on: Tue Sep 03 04:34:13 PM 2024
ktrns/scrnaseq 38ca32c4ba344d783cf79821eddac5495563f985
Container NA
R R version 4.2.1 (2022-06-23)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.6 LTS
Host name hpc-rc11
Host OS #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic)
Packages abind1.4-5, AnnotationDbi1.60.2, AnnotationHub3.6.0, ape5.8, assertthat0.2.1, backports1.4.1, beachmat2.14.2, beeswarm0.4.0, bibtex0.5.1, Biobase2.58.0, BiocFileCache2.6.1, BiocGenerics0.44.0, BiocManager1.30.22, BiocParallel1.32.6, BiocSingular1.14.0, BiocVersion3.16.0, Biostrings2.66.0, bit4.0.5, bit644.0.5, bitops1.0-7, blob1.2.4, bslib0.6.1, cachem1.0.8, celldex1.8.0, checkmate2.3.1, circlize0.4.16, cli3.6.2, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, crayon1.5.2, curl5.2.1, data.table1.15.2, DBI1.2.2, dbplyr2.2.1, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, enrichR3.2, evaluate0.23, ExperimentHub2.6.0, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, filelock1.0.3, fitdistrplus1.1-11, forcats1.0.0, future1.33.1, future.apply1.11.1, generics0.1.3, GenomeInfoDb1.34.9, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggprism1.0.5, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, glmGamPoi1.10.2, GlobalOptions0.1.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gridGraphics0.5-1, gtable0.3.4, highr0.10, hms1.1.3, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, interactiveDisplayBase1.36.0, IRanges2.32.0, irlba2.3.5.1, janitor2.2.0, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KEGGREST1.38.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, MAST1.27.1, Matrix1.6-5, MatrixGenerics1.10.0, matrixStats1.1.0, memoise2.0.1, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, paletteer1.6.0, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pheatmap1.0.12, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, prettyunits1.2.0, progress1.2.3, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, ragg1.2.7, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RCurl1.98-1.14, RefManageR1.4.0, rematch22.1.2, renv0.16.0, reshape21.4.4, reticulate1.35.0, rjson0.2.21, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, RSQLite2.3.5, rstudioapi0.15.0, rsvd1.0.5, Rtsne0.17, S4Vectors0.36.2, sass0.4.8, ScaledMatrix1.6.0, scales1.3.0, scattermore1.2, scCustomize2.1.2, sceasy0.0.7, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shape1.4.6.1, shiny1.8.0, SingleCellExperiment1.20.1, SingleR2.0.0, snakecase0.11.1, sp2.1-3, spam2.10-0, sparseMatrixStats1.10.0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, SummarizedExperiment1.28.0, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, textshaping0.3.7, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, WriteXLS6.7.0, xfun0.41, xml21.3.6, xtable1.8-4, XVector0.38.0, yaml2.3.8, zip2.3.1, zlibbioc1.44.0, zoo1.8-12


Credits and References

This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).

# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))
Coenen, A., Pearce, A., 2024. Understanding UMAP.
Hao, Y., Hao, S., Andersen-Nissen, E., III, W.M.M., Zheng, S., Butler, A., Lee, M.J., Wilk, A.J., Darby, C., Zagar, M., Hoffman, P., Stoeckius, M., Papalexi, E., Mimitou, E.P., Jain, J., Srivastava, A., Stuart, T., Fleming, L.B., Yeung, B., Rogers, A.J., McElrath, J.M., Blish, C.A., Gottardo, R., Smibert, P., Satija, R., 2021. Integrated analysis of multimodal single-cell data. Cell. https://doi.org/10.1016/j.cell.2021.04.048
Hao, Y., Stuart, T., Kowalski, M.H., Choudhary, S., Hoffman, P., Hartman, A., Srivastava, A., Molla, G., Madad, S., Fernandez-Granda, C., Satija, R., 2023. Dictionary learning for integrative, multimodal and scalable single-cell analysis. Nature Biotechnology. https://doi.org/10.1038/s41587-023-01767-y
Traag, V.A., Waltman, L., van Eck, N.J., 2019. From louvain to leiden: Guaranteeing well-connected communities. Scientific Reports 9. https://doi.org/10.1038/s41598-019-41695-z